% SIBGRAPI 2019 conference paper (Caetano, Bremond, Schwartz).
% NOTE(review): the citation key below contains a non-ASCII character;
% classic 8-bit BibTeX toolchains may mishandle it. It is kept unchanged
% so that existing citations of this key still resolve.
% The repository (urlib) link is stored in the non-standard field ibi-url,
% because BibTeX ignores a second url field in the same entry.
@InProceedings{CaetanoBrémSchw:2019:SkImRe,
  author              = "Caetano, Carlos and Br{\'e}mond, Fran{\c{c}}ois and
                         Schwartz, William Robson",
  affiliation         = "{Universidade Federal de Minas Gerais} and INRIA and
                         {Universidade Federal de Minas Gerais}",
  title               = "Skeleton Image Representation for {3D} Action
                         Recognition based on Tree Structure and Reference
                         Joints",
  booktitle           = "Proceedings of the 32nd {SIBGRAPI} Conference on
                         Graphics, Patterns and Images",
  year                = "2019",
  editor              = "Oliveira, Luciano Rebou{\c{c}}as de and Sarder, Pinaki
                         and Lage, Marcos and Sadlo, Filip",
  organization        = "Conference on Graphics, Patterns and Images, 32. (SIBGRAPI)",
  publisher           = "IEEE Computer Society",
  address             = "Los Alamitos",
  keywords            = "skeleton image representation, convolutional neural
                         network (CNN), 3D action recognition",
  abstract            = "In the last years, the computer vision research
                         community has studied on how to model temporal
                         dynamics in videos to employ 3D human action
                         recognition. To that end, two main baseline
                         approaches have been researched: (i) Recurrent Neural
                         Networks (RNNs) with Long-Short Term Memory (LSTM);
                         and (ii) skeleton image representations used as input
                         to a Convolutional Neural Network (CNN). Although RNN
                         approaches present excellent results, such methods
                         lack the ability to efficiently learn the spatial
                         relations between the skeleton joints. On the other
                         hand, the representations used to feed CNN approaches
                         present the advantage of having the natural ability
                         of learning structural information from 2D arrays
                         (i.e., they learn spatial relations from the skeleton
                         joints). To further improve such representations, we
                         introduce the Tree Structure Reference Joints Image
                         (TSRJI), a novel skeleton image representation to be
                         used as input to CNNs. The proposed representation
                         has the advantage of combining the use of reference
                         joints and a tree structure skeleton. While the
                         former incorporates different spatial relationships
                         between the joints, the latter preserves important
                         spatial relations by traversing a skeleton tree with
                         a depth-first order algorithm. Experimental results
                         demonstrate the effectiveness of the proposed
                         representation for 3D action recognition on two
                         datasets achieving state-of-the-art results on the
                         recent NTU RGB+D~120 dataset.",
  conference-location = "Rio de Janeiro, RJ, Brazil",
  conference-year     = "28-31 Oct. 2019",
  doi                 = "10.1109/SIBGRAPI.2019.00011",
  url                 = "http://dx.doi.org/10.1109/SIBGRAPI.2019.00011",
  language            = "en",
  ibi                 = "8JMKD3MGPEW34M/3U2JR42",
  ibi-url             = "http://urlib.net/ibi/8JMKD3MGPEW34M/3U2JR42",
  targetfile          = "SIBGRAPI2019_submitted_camera_ready.pdf",
  urlaccessdate       = "2024, Apr. 27"
}